#### setting environment ####
require(quanteda)
require(stm)

newspaper.names <- c("LAT", "NYT", "SFC", "WP", "WSJ", "WT")

#### data preparation ####
# document-feature matrix of newspaper editorials published from December 1, 2019 to December 31, 2019
load("US_dfm.Rdata")
# convering the document-feature matrix to stm format
STM.data <- convert(us.dfm, to = "stm")

## number of editorials (Table A.7)
table(docvars(us.dfm, "newspaper"))

#### estimating the structural topic models and choosing the number of topics (Figure A.9) ####
## estimating the models varying the number of topics from 10 to 40
n.topics <- seq(10, 40, 1)

# estimation
for (i in 1:length(n.topics)) {
  set.seed(n.topics[i])
  STM.result <- stm(documents = STM.data$documents, 
                    vocab = STM.data$vocab, 
                    K = n.topics[i], 
                    prevalence = ~ s(numeric.date), 
                    data = STM.data$meta)
  save(STM.result, file = paste0(foldername, "US_STM_result_", n.topics[i], ".Rdata"))
  print(paste0("Estimation of a model in which the number of topic is ", n.topics[i], " is completed at ", date(), "."))
}

# compute exclusivity and semantic coherence
exclusivity.evaluation.1 <- semantic.coherence.evaluation.1 <- rep(NA, length(n.topics))
for (i in 1:length(n.topics)) {
  load(paste0("US_STM_result_", n.topics[i], ".Rdata"))
  exclusivity.evaluation.1[i] <- mean(exclusivity(STM.result))
  semantic.coherence.evaluation.1[i] <- mean(semanticCoherence(STM.result, STM.data$documents))
}

# draw the figure
cairo_pdf("Figure_A9.pdf",
          width = 5.6, height = 3, pointsize = 8, family = "Helvetica")
layout(matrix(1:2, 1, 2))
par(mar = c(4, 4, 3, 2))
plot(c(1:length(n.topics)), exclusivity.evaluation.1, type = "b", 
     main = "Exclusivity", xlab = "Number of Topics", ylab = "Average Exclusivity", xaxt = "n")
axis(1, at = seq(1, length(n.topics), 5), 
     labels = n.topics[seq(1, length(n.topics), 5)], cex.axis = 0.9)
plot(c(1:length(n.topics)), semantic.coherence.evaluation.1, type = "b", 
     main = "Semantic Coherence", xlab = "Number of Topics", ylab = "Average Semantic Coherence", xaxt = "n")
axis(1, at = seq(1, length(n.topics), 5), 
     labels = n.topics[seq(1, length(n.topics), 5)], cex.axis = 0.9)
dev.off()